import pandas as pd
import numpy as np
import dalex as dx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, cross_val_score
from sklearn_extra.cluster import KMedoids
import warnings
# Silence all warnings to keep notebook output readable
# (sklearn / seaborn emit frequent deprecation chatter).
warnings.filterwarnings('ignore')
# Load the Monkeypox symptom dataset: one row per patient,
# binary symptom flags plus a categorical 'Systemic Illness' column.
df = pd.read_csv('DATA.csv')
# Peek at the first rows to inspect the column layout.
df.head()
| Patient_ID | Systemic Illness | Rectal Pain | Sore Throat | Penile Oedema | Oral Lesions | Solitary Lesion | Swollen Tonsils | HIV Infection | Sexually Transmitted Infection | MonkeyPox | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | P0 | None | False | True | True | True | False | True | False | False | Negative |
| 1 | P1 | Fever | True | False | True | True | False | False | True | False | Positive |
| 2 | P2 | Fever | False | True | True | False | False | False | True | False | Positive |
| 3 | P3 | None | True | False | False | False | True | True | True | False | Positive |
| 4 | P4 | Swollen Lymph Nodes | True | True | True | False | False | True | True | False | Positive |
# Report the dataset size (one row per patient).
print(f'There are {len(df)} patients in dataset')
There are 25000 patients in dataset
# Dtypes and non-null counts: 8 boolean symptom flags, 3 object columns.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 25000 entries, 0 to 24999 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Patient_ID 25000 non-null object 1 Systemic Illness 25000 non-null object 2 Rectal Pain 25000 non-null bool 3 Sore Throat 25000 non-null bool 4 Penile Oedema 25000 non-null bool 5 Oral Lesions 25000 non-null bool 6 Solitary Lesion 25000 non-null bool 7 Swollen Tonsils 25000 non-null bool 8 HIV Infection 25000 non-null bool 9 Sexually Transmitted Infection 25000 non-null bool 10 MonkeyPox 25000 non-null object dtypes: bool(8), object(3) memory usage: 781.4+ KB
# Categorical summary (count / unique / top / freq) per column, transposed.
df.describe().T
| count | unique | top | freq | |
|---|---|---|---|---|
| Patient_ID | 25000 | 25000 | P0 | 1 |
| Systemic Illness | 25000 | 4 | Fever | 6382 |
| Rectal Pain | 25000 | 2 | False | 12655 |
| Sore Throat | 25000 | 2 | True | 12554 |
| Penile Oedema | 25000 | 2 | True | 12612 |
| Oral Lesions | 25000 | 2 | False | 12514 |
| Solitary Lesion | 25000 | 2 | True | 12527 |
| Swollen Tonsils | 25000 | 2 | True | 12533 |
| HIV Infection | 25000 | 2 | True | 12584 |
| Sexually Transmitted Infection | 25000 | 2 | False | 12554 |
| MonkeyPox | 25000 | 2 | Positive | 15909 |
# Count missing cells. BUG FIX: `isna` and `isnull` are aliases in pandas,
# so summing both double-counted every missing value; count once instead.
print(f'Missing values: {df.isna().sum().sum()} ')
Missing values: 0
# Drop Patient_ID: it is unique per row (25000 unique values per describe())
# and therefore carries no predictive signal.
df.drop('Patient_ID', axis=1, inplace=True)
# Class balance of the target (MonkeyPox) as fractions.
df['MonkeyPox'].value_counts(normalize=True)
Positive 0.63636 Negative 0.36364 Name: MonkeyPox, dtype: float64
# Shared pastel palette, reused by the plots in later cells.
palette = sns.color_palette('pastel')
fig, ax = plt.subplots(figsize=(8, 4))
# Horizontal bar chart of the Positive/Negative target split.
sns.countplot(y='MonkeyPox', data=df, palette=palette, ax=ax)
ax.set_title('Distribution of Monkey Pox', fontsize=15);
# Systemic Illness distribution among infected vs. not-infected patients.
# Among positives, Fever and Swollen Lymph Nodes dominate, while among
# not-infected people these illnesses are noticeably rarer than the others.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
illness_dist_pos = df[df['MonkeyPox'] == 'Positive']['Systemic Illness'].value_counts().sort_index()
ax[0].pie(x=illness_dist_pos.values,
labels=illness_dist_pos.index,
colors=palette,
autopct='%.0f%%')
ax[0].set_title('Distribution of systemic illness between infected', fontsize=15);
illness_dist_neg = df[df['MonkeyPox'] == 'Negative']['Systemic Illness'].value_counts().sort_index()
ax[1].pie(x=illness_dist_neg.values,
labels=illness_dist_neg.index,
colors=palette,
autopct='%.0f%%')
ax[1].set_title('Distribution of systemic illness between not infected', fontsize=15);
# One count plot per binary symptom, split by target class (2x4 grid).
fig, ax = plt.subplots(2, 4, figsize=(20, 8))
ax = ax.flatten()
symptom_cols = df.columns.drop(['MonkeyPox', 'Systemic Illness'])
for panel, feature in zip(ax, symptom_cols):
    sns.countplot(x=feature, hue='MonkeyPox', data=df, palette=palette, ax=panel)
    panel.set_title(feature, fontsize=20)
    panel.set(ylabel=None, xlabel=None)
    panel.tick_params(axis='both', labelsize=12)
# OrdinalEncoder to make the data fully numerical: booleans become 0/1,
# 'Systemic Illness' becomes 0..3, the 'MonkeyPox' target becomes 0/1.
df_enc = pd.DataFrame(OrdinalEncoder().fit_transform(df), columns=df.columns)
df_enc.head()
| Systemic Illness | Rectal Pain | Sore Throat | Penile Oedema | Oral Lesions | Solitary Lesion | Swollen Tonsils | HIV Infection | Sexually Transmitted Infection | MonkeyPox | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 2 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 3 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 |
| 4 | 3.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 |
# Features / target split of the encoded frame.
x = df_enc.drop('MonkeyPox', axis=1)
y = df_enc['MonkeyPox']
# Baseline random forest fit on the full dataset; explained in later cells.
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=17)
rfc.fit(x, y)
RandomForestClassifier(n_jobs=-1, random_state=17)
# dalex explainer wrapping the fitted forest; used below for
# model_parts / model_profile / predict_parts.
exp = dx.Explainer(rfc, x, y)
Preparation of a new explainer is initiated -> data : 25000 rows 9 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 25000 values -> model_class : sklearn.ensemble._forest.RandomForestClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x7f4ca78e9040> will be used (default) -> predict function : Accepts pandas.DataFrame and numpy.ndarray. -> predicted values : min = 0.0, mean = 0.636, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.971, mean = 5.13e-05, max = 0.963 -> model_info : package sklearn A new explainer has been created!
# Permutation importance: the importance score of a variable is the average
# drop in model performance caused by removing it -- the bigger the drop, the
# bigger the score. Since actually deleting a feature is not computationally
# easy, the feature is shuffled instead, simulating its removal.
exp.model_parts().plot()
# Partial-dependence profiles per feature. Systemic Illness has 4 encoded
# values (0, 1, 2, 3): for 0 and 3 the average output is about 0.75, closer
# to 1 (MonkeyPox: Positive), while 1 and 2 pull it toward 0. Swollen Tonsils
# and Solitary Lesion also show notable profiles.
exp.model_profile().plot()
Calculating ceteris paribus: 100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 26.17it/s]
# Split the encoded frame by target class, then find the single medoid
# (most representative real sample) of each class with 1-cluster KMedoids.
df_list = [d for _, d in df_enc.groupby(['MonkeyPox'])]
medoids = {}
for group in df_list:
    clusterer = KMedoids(n_clusters=1, random_state=17)
    clusterer.fit(group.drop('MonkeyPox', axis=1))
    # All rows in `group` share one target value; use it as the dict key.
    label = group['MonkeyPox'].iloc[0]
    medoids[label] = clusterer.cluster_centers_[0]
medoids
{0.0: array([2., 0., 0., 0., 0., 0., 0., 0., 0.]),
1.0: array([1., 1., 1., 1., 1., 1., 1., 1., 1.])}
# Break-down (with interactions) attribution plot for each class medoid.
for medoid in medoids.values():
    exp.predict_parts(medoid, type='break_down_interactions').plot()
# SHAP attribution plot for each class medoid, labelled by its target value.
for label, medoid in medoids.items():
    print(f'Breakdown for the most representative sample of target {label}')
    exp.predict_parts(medoid, type='shap').plot()
Breakdown for the most representative sample of target 0.0
Breakdown for the most representative sample of target 1.0
# 5-fold stratified cross-validated ROC-AUC of the baseline forest.
score = cross_val_score(
    RandomForestClassifier(n_jobs=-1, random_state=17),
    x,
    y,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=17),
)
print(f'Roc-auc of the model: {score.mean()}')
Roc-auc of the model: 0.6682451345618627
# 5-fold stratified cross-validated recall of the baseline forest.
score = cross_val_score(
    RandomForestClassifier(n_jobs=-1, random_state=17),
    x,
    y,
    scoring='recall',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=17),
)
print(f'Recall of the model: {score.mean()}')
Recall of the model: 0.8597030095608135
# Hyperparameter tuning with GridSearchCV, optimizing recall over a
# 3-fold stratified split (4 * 3 * 4 * 5 = 240 candidate configurations).
params = {'n_estimators': [10, 50, 100, 150],
          'max_features': [3, 6, 9],
          'min_samples_leaf': [1, 3, 5, 7],
          'max_depth': [3, 5, 10, 15, 20]}
gr = GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=17),
                  param_grid=params,
                  cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=17),
                  scoring='recall',
                  n_jobs=-1)
gr.fit(x, y)
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=17, shuffle=True),
estimator=RandomForestClassifier(n_jobs=-1, random_state=17),
n_jobs=-1,
param_grid={'max_depth': [3, 5, 10, 15, 20],
'max_features': [3, 6, 9],
'min_samples_leaf': [1, 3, 5, 7],
'n_estimators': [10, 50, 100, 150]},
scoring='recall')
# Best cross-validated recall found by the grid search, and the winning
# hyperparameter combination.
print(f'Locally best RandomForest: {gr.best_score_}')
print(gr.best_params_)
Locally best RandomForest: 0.9792570243258534
{'max_depth': 3, 'max_features': 3, 'min_samples_leaf': 1, 'n_estimators': 100}